#### Data Screening Script — Class 2 ####

## Load data ##

# Set the working directory to the path where the input files are found.
# NOTE(review): setwd() ties the script to one machine — adjust as needed.
setwd("~/Documents/Macs folder UW/courses/FISH 560 multivariate/course materials")

# The screening helpers used below (replace.missing, drop.var, foa.plots,
# data.trans, data.stand, uv.outliers, mv.outliers, ...) come from the
# course's "biostats" source file. Originally it was loaded interactively
# via File > Source File; source it here so the script is self-contained.
# TODO: confirm the biostats file name/path.
if (file.exists("biostats.R")) {
  source("biostats.R")
}

# Libraries needed for this exercise
library(vegan)
library(permute)
library(pastecs)
library(boot)

# Read in data (first column used as row names)
envdata <- read.csv("env.csv", header = TRUE, row.names = 1)
spedata <- read.csv("abund.csv", header = TRUE, row.names = 1)

# Make sure the file read in OK
head(spedata)

# Summary statistics for the environmental and species data
stat.desc(envdata)
stat.desc(spedata)

# Drop column 1 (it's non-numeric), so we don't have to put in the
# exception on every call
spedata <- spedata[, -1]

## Evaluate missing data ##

# Replace missing data with the median (default)
testdata <- replace.missing(spedata)
# Replace missing data with the mean instead
testdata <- replace.missing(spedata, method = "mean")
# Drop a variable if more than 5% of its observations are missing
testdata <- drop.var(spedata, pct.missing = 5)

## Examine frequency-of-occurrence and abundance graphs ##

# Frequency of abundance; 10 graphs total
foa.plots(spedata)

## Screen variables for "sufficiency" and eliminate if necessary ##

# Drop rare species with non-zero values in fewer than 5% of sites
testdata <- drop.var(spedata, min.po = 5)
# Drop rare species with fewer than 5 non-zero values
testdata <- drop.var(spedata, min.fo = 5)
# Drop abundant generalist species, those in more than 95% of plots
testdata <- drop.var(spedata, max.po = 95)
# Drop variables with too little variation, those with CV < 5%
testdata <- drop.var(spedata, min.cv = 5)

## Examine distributional properties of the data ##

# ECDF = empirical cumulative distribution functions. A variable with a
# uniform distribution shows a perfect diagonal straight line; deviations
# indicate non-uniformity.
ecdf.plots(spedata)
# Conventional histograms
hist.plots(spedata)
# Box-and-whisker plots
box.plots(spedata)
# Normal quantile-quantile plots
qqnorm.plots(spedata)
# All of the plots above, depicted TOGETHER
uv.plots(spedata)

## Evaluate the need for data transformation ##

# Transform data to log (base 10)
data.trans(spedata, method = "log")
# "Power" transformation (includes binary transformation); an exponent of
# 0.5 gives a square-root transformation
data.trans(spedata, method = "power", exp = 0.5)
# To transform into presence/absence (i.e. binary transformation), use the
# power method with an exponent equal to zero
data.trans(spedata, method = "power", exp = 0)
# Arcsine square-root transformation, for PROPORTIONAL data (range 0-1)
data.trans(spedata, method = "asin")

# ONCE you've decided on a TRANSFORMATION, save the transformed data as a
# new object. Two examples of saving a log transformation:
tradata <- data.trans(spedata, method = "log", plot = FALSE)
# Same result as above, but the +1 must be added by hand, whereas the
# data.trans function includes it automatically
tradata <- log10(spedata + 1)

## Evaluate the need for data standardization ##

# Standardization adjusts for scale differences between variables
# (e.g. flow = 1-10,000 vs. temp = 1-30) and can also adjust for
# differences between individuals or sites (row standardization).
# To decide whether to standardize, check the CVs of the variables (by
# column and row) — done previously, but here is the command again:
stat.desc(spedata)
# Example of row normalization (rescale each row so the sums equal 1)
data.stand(spedata, method = "total", margin = "row")

## SAVE A TRANSFORMED DATASET AS AN OUTFILE ##

# As an alternative, you can just rename it as a new object and do the
# analyses from there. Here the object `tradata` is written to a file
# named "tradata", comma-separated, with missing values written as NA and
# row names kept.
write.table(tradata, file = "tradata", sep = ",", na = "NA", row.names = TRUE)

## EVALUATING THE DATASET FOR OUTLIERS ##

# uv (univariate) computes standardized values (z-scores) and returns a
# data frame listing the samples and variables greater than a specified
# number of standard deviations (usually 3, but 1 in this example for
# illustrative purposes)
uv.outliers(envdata, id = "Sinuosity:BasinAre", var = "Elev", sd.limit = 1)
# mv (multivariate) outliers based on Euclidean distance; same note here
# (usually 3 SD, but 1 in this example for illustrative purposes)
mv.outliers(envdata, method = "euclidean", sd.limit = 1)
# Another multivariate option is Mahalanobis distance
mv.outliers(envdata, method = "mahalanobis", sd.limit = 1)

## Exercise ##

# Need my own data first, then:
# 1. Summary statistics
stat.desc(envdata)
# 2a. Do outliers exist?
mv.outliers(envdata, method = "euclidean", sd.limit = 3)
# 2b.